In this document we create different QC reports, including: (1) correlations between principal components and other technical or biological variables, denoted as the metadata variables, (2) correlations among the metadata variables, and (3) outlier detection. These tests are done separately for each dataset - a set of samples from a single site and a single tissue.

The RNA-seq analysis has two additional tests that are not a part of the standatd tests that we perform automatically for each assay: (1) sex checks, and (2) correlations between freeze times and sex.

Load the data and set the session

library(data.table);library(ggplot2);library(ggcorrplot);library(caret)
# load the data and functions to the session
repo_dir = '/oak/stanford/groups/smontgom/nicolerg/src/MOTRPAC/motrpac-bic-norm-qc'
data_dir = '/oak/stanford/groups/smontgom/nicolerg/MOTRPAC/PASS_ANALYSIS/PASS1B/RNA/differential_splicing/outputs/perind_numers_counts'
source(sprintf("%s/tools/unsupervised_normalization_functions.R", repo_dir))
source(sprintf("%s/tools/gcp_functions.R", repo_dir))
source(sprintf("%s/tools/qc_report_aux_functions.R", repo_dir))
source(sprintf("%s/tools/association_analysis_methods.R", repo_dir))

scatterplot_colors=c('paxgene rna'='#FF0000', # blood and paxgene rna are the same (RNA-seq)
             blood='#FF0000',
             'edta plasma'="#FF6161",
             heart='#EB7602',
             aorta="#B22222",
             spleen="#0000FF",
             gastrocnemius='#00CD00',
             'vastus lateralis'='#077D07',
             'white adipose'='#1E90FF',
             'brown adipose'='#FFA500',
             liver="#E349E3",
             lung="#14AE9E",
             kidney='#A020F0',
             adrenal="#DDA0DD",
             adrenals="#DDA0DD",
             cortex='#F2CA00',
             hypothalamus='#C1A100',
             hippocampus='#937B00',
             'small intestine'='#A18277',
             colon='#7D4C3B',
             ovaries="#FFAEB9",
             testes="#000080")

# # Data buckets
# main_bucket1 = "gs://motrpac-portal-transfer-stanford/Output/PASS1B/RNA-SEQ/batch3_20191016/results/"
# main_bucket2 = "gs://motrpac-portal-transfer-sinai/Output/rna-seq/rat/"
# buckets = data.frame(
#   site = c("stanford","sinai","sinai"),
#   bucket = c(main_bucket1,paste0(main_bucket2,"batch5_20191031/results/"),
#              paste0(main_bucket2,"batch4_20191021/results/")),
#   qc_file = c(
#     "rnaseq_pipeline_qc_metrics_batch3.csv",
#     "rnaseq_pipeline_qc_metrics_batch5.csv",
#     "rnaseq_pipeline_qc_metrics_batch4.csv"
#   ),
#   counts_file = rep("rsem_genes_count.txt",3),
#   fpkms_file = rep("rsem_genes_fpkm.txt",3),
#   stringsAsFactors = F
# )

# loading splicing results from local files for now
rnaseq_pipeline_data = list()
for(j in system(sprintf('ls %s',data_dir), intern=T)){
  file = sprintf('%s/%s', data_dir, j)
  tissue = gsub("_perind.*","",j)
  curr_counts = read.table(file, sep=' ', header=T, check.names=F)
  rnaseq_pipeline_data[[tissue]] = list(counts = curr_counts)
}

# # Read the gene expression data in
# rnaseq_pipeline_data = list()
# for(j in 1:nrow(buckets)){
#   obj = download_bucket_files_to_local_dir(buckets$bucket[j],local_path = data_dir,remove_prev_files = T)
#   curr_qc = fread(paste0(data_dir,buckets$qc_file[j]),data.table = F,stringsAsFactors = F)
#   curr_counts = fread(paste0(data_dir,buckets$counts_file[j]),data.table = F,stringsAsFactors = F)
#   curr_fpkms = fread(paste0(data_dir,buckets$fpkms_file[j]),data.table = F,stringsAsFactors = F)
#   rnaseq_pipeline_data[[j]] = list(
#     qc_scores = curr_qc,counts = curr_counts,fpkms = curr_fpkms
#   )
# }

# RNA-seq QC 
qc = dl_read_gcp("gs://motrpac-internal-release2-results/pass1b-06/transcriptomics/qa-qc/motrpac_pass1b-06_transcript-rna-seq_qa-qc-metrics.csv", sep=',')

# Metadata from DMAQC: use the merged data frame
dmaqc_metadata = 'gs://motrpac-internal-release2-results/pass1b-06/phenotype/motrpac_pass1b-06_pheno_viallabel-data.txt'
dmaqc_dict = 'gs://motrpac-internal-release2-results/pass1b-06/phenotype/motrpac_pass1b_pheno_merged-dictionary.txt'

# download and format phenotypic data 
dmaqc_metadata = dl_read_gcp(dmaqc_metadata, sep='\t')
cols = dl_read_gcp(dmaqc_dict, sep='\t')
old_cols = colnames(dmaqc_metadata)
new_cols = tolower(cols[match(old_cols, BICUniqueID), FullName]) 
colnames(dmaqc_metadata) = new_cols # this isn't perfect, but we don't care about the columns it doesn't work for for now

# make some variables human-readable
# create new variables "protocol", "agegroup", "intervention", "sacrificetime", "sex" with readable strings 
for (var in c('key.protocol','key.agegroup','key.intervention','key.sacrificetime','registration.sex')){
  d = cols[Field.Name == gsub('.*\\.','',var)]
  keys=unname(unlist(strsplit(d[,Categorical.Values],'\\|')))
  values=tolower(unname(unlist(strsplit(d[,Categorical.Definitions],'\\|'))))
  names(values) = keys
  # match keys to values; create new column 
  new_var = gsub(".*\\.","",var)
  dmaqc_metadata[,(new_var) := unname(values)[match(get(var), names(values))]]
}
dmaqc_metadata[,time_to_freeze := calculated.variables.frozetime_after_train - calculated.variables.deathtime_after_train]
# clean up "sacrificetime"
dmaqc_metadata[,sacrificetime := sapply(sacrificetime, function(x) gsub(' week.*','w',x))]
# clean up 'intervention'
dmaqc_metadata[grepl('training',intervention), intervention := 'training']
# make "group" - "1w", "2w", "4w", "8w", "control"
dmaqc_metadata[,group := sacrificetime]
dmaqc_metadata[intervention == 'control', group := 'control']
# make tech ID a string
dmaqc_metadata[,specimen.processing.techid := paste0('tech',specimen.processing.techid)]
dmaqc_metadata[,tissue := specimen.processing.sampletypedescription]

# merged_dmaqc_data = dl_read_gcp('gs://motrpac-internal-release2-results/pass1b-06/phenotype/motrpac_pass1b-06_pheno_viallabel-data.txt', sep='\t')
# dmaqc_dict = dl_read_gcp('gs://motrpac-internal-release2-results/pass1b-06/phenotype/motrpac_pass1b_pheno_merged-dictionary.txt', sep='\t')
# rownames(merged_dmaqc_data) = merged_dmaqc_data$viallabel
# # define the tissue variable
# merged_dmaqc_data$tissue = merged_dmaqc_data$sampletypedescription
# # define the time to freeze variable
# merged_dmaqc_data$time_to_freeze = merged_dmaqc_data$calculated.variables.time_death_to_collect_min + 
#   merged_dmaqc_data$calculated.variables.time_collect_to_freeze_min

# merge qc
rnaseq_meta = merge(dmaqc_metadata, qc, by.x='viallabel', by.y='vial_label')
rnaseq_meta = as.data.frame(rnaseq_meta)
colnames(rnaseq_meta) = gsub(" .*","",colnames(rnaseq_meta))
rownames(rnaseq_meta) = rnaseq_meta$viallabel
rnaseq_meta$animal.key.is_control = rnaseq_meta$key.intervention == 3

Metadata variables

MOP-flagging samples is skipped here because it's redundant with the pass1b_rnaseq QC report.

We next define the set of metadata variables. The first set is the set of variables from the data generating pipeline. The second set is of the biospecimens and is shared across all assays.

# Define technical and biological variables to be considered in the QC
pipeline_qc_cols = c("RIN","reads",
                     "Lib_frag_size",
                     "RNA_extr_conc",
                     "pct_rRNA","pct_globin",
                     "pct_umi_dup","median_5_3_bias",
                     "pct_multimapped","pct_GC","pct_chrM")
biospec_cols = c("registration.sex","key.sacrificetime",
                  "animal.key.is_control",
                  "terminal.weight.bw","time_to_freeze",
                  "bid","pid")

Dataset normalization

We use log2 normalization for the intron cluster counts.

# Labels for separating the data by site and tissue
rnaseq_meta_batchs = unique(rnaseq_meta[,c("GET_site","Tissue")])
norm_rnaseq_data = list()
for(i in 1:nrow(rnaseq_meta_batchs)){
  curr_site = rnaseq_meta_batchs[i,1]
  curr_tissue = rnaseq_meta_batchs[i,2]
  curr_samples = as.character(
    rnaseq_meta$viallabel[rnaseq_meta$GET_site==curr_site &
    rnaseq_meta$Tissue==curr_tissue])
  curr_site = tolower(curr_site)
  curr_tissue = tolower(curr_tissue)
  curr_tissue = gsub("hypothalmus","hypothalamus",curr_tissue)
  # For rat data, take samples whose label id starts with "9" - remove qc pools
  curr_samples = curr_samples[grepl("^9",curr_samples)]
  
  # For normalized counts use:
  my_tissue = gsub(" ","_",gsub(" powder","",curr_tissue))
  curr_counts = rnaseq_pipeline_data[[my_tissue]]$counts
  if(is.null(curr_counts)){next}
  curr_data = log(curr_counts+1)
  
  # remove zero variance rows
  curr_data = curr_data[apply(curr_data,1,sd)>0,]
  # at this point, the highest variance intron clusters are on sex chromosomes. should I remove sex chroms?
  curr_meta1 = rnaseq_meta[curr_samples,pipeline_qc_cols]
  curr_meta2 = rnaseq_meta[curr_samples,biospec_cols]
  non_control_inds = (!curr_meta2$key.intervention == 3)
  curr_name = paste(curr_tissue,curr_site,sep=",")
  norm_rnaseq_data[[curr_name]] = list(log2 = curr_data,
      pipeline_meta = curr_meta1,dmaqc_meta = curr_meta2,
      non_control_inds=non_control_inds)
}
save(norm_rnaseq_data, file="/oak/stanford/groups/smontgom/nicolerg/MOTRPAC/PASS_ANALYSIS/PASS1B/RNA/differential_splicing/bic_qc_report/rdata/norm_rnaseq_data.RData")
# save_to_bucket(norm_rnaseq_data,file = "norm_rnaseq_data.RData",
#               bucket = "gs://bic_data_analysis/pass1a/rnaseq/")

PC-based association analysis

Here we first take each tissue and analyze its samples. For the top principal components (5) we compute their association with the metadata variables. We use Spearman correlation and a linear test for significance.

pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
p_thr = 0.0001
for(currname in names(norm_rnaseq_data)){
  curr_meta = cbind(norm_rnaseq_data[[currname]]$pipeline_meta,
                     norm_rnaseq_data[[currname]]$dmaqc_meta)
  # remove metadata variables with NAs
  curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
  # remove bid and pid
  curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
  # remove metadata variables with no variance
  curr_meta = curr_meta[,lapply(curr_meta, stats::var)!=0]
  
  # limit the data to the training rats and remove the is_control col
  training_rats = !curr_meta$animal.key.is_control
  curr_meta = curr_meta[training_rats,]
  curr_meta = curr_meta[,!grepl("is_control",names(curr_meta))]
  curr_data = norm_rnaseq_data[[currname]]$log2[,training_rats]
  # remove zero variance rows
  curr_data = curr_data[apply(curr_data,1,sd)>0,]
  
  curr_pca = prcomp(t(curr_data),scale. = T,center = T)
  curr_pcax = curr_pca$x[,1:5]
  explained_var = summary(curr_pca)[["importance"]][3,5]
  
  corrs = cor(curr_pcax,curr_meta,method="spearman")
  corrsp = pairwise_eval(
    curr_pcax,curr_meta,func=pairwise_association_wrapper,
    f=1)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(t(corrs),lab=T,lab_size=2.5,hc.order = F) +
    ggtitle(currname) +
    theme(plot.title = element_text(hjust = 0.5,size=20)))
  
  for(i in 1:nrow(corrsp)){
    for(j in 1:ncol(corrsp)){
      if(corrsp[i,j]>p_thr){next}
      pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
            c(currname,
              rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
            )
    }
  }
  if(!is.null(pcs_vs_qc_var_report)){
      colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
                                  "qc_metric","rho(spearman)","p-value")
  }
}

# Format the reports - for a nicer presentation in a table
pcs_vs_qc_var_report[,5] = format(
  as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
  as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
metadata_variable_assoc_report[,5] = format(
  as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
  as.numeric(metadata_variable_assoc_report[,4]),digits=3)

PCA outliers

In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quantile range away from the boxplot. Samples outside this range are then flagged.

pca_outliers_report = c()
for(currname in names(norm_rnaseq_data)){
  curr_data = norm_rnaseq_data[[currname]]$log2
  curr_pca = prcomp(t(curr_data),scale. = T,center = T)
  curr_pcax = curr_pca$x[,1:5]
  explained_var = summary(curr_pca)[["importance"]][3,5]
  
  # Univariate: use IQRs
  pca_outliers = c()
  for(j in 1:ncol(curr_pcax)){
    outlier_values <- boxplot.stats(curr_pcax[,j],coef = 3)$out
    for(outlier in names(outlier_values)){
      pca_outliers_report = rbind(pca_outliers_report,
       c(currname,paste("PC",j,sep=""),outlier,
         format(outlier_values[outlier],digits=5))
      )
      if(!is.element(outlier,names(pca_outliers))){
        pca_outliers[outlier] = outlier_values[outlier]
      }
    }
  }
  
  # Plot the outliers
  if(length(pca_outliers)>0){
    # print(length(kNN_outliers))
    df = data.frame(curr_pcax,
                outliers = rownames(curr_pcax) %in% names(pca_outliers))
    col = rep("black",nrow(df))
    col[df$outliers] = "green"
    plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(currname,"flagged outliers"),
         xlab = "PC1",ylab="PC2")
    plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(currname,"flagged outliers"),
         xlab = "PC3",ylab="PC4")
  }
}

colnames(pca_outliers_report) =  c("dataset","PC","sample","score")